/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

// $Id: PrefixURLFilter.java,v 1.1 2004/08/17 16:34:30 guehene Exp $

package net.nutch.net;

import java.io.Reader;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;

import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;

import java.util.logging.Logger;

import net.nutch.util.*;

/**
 * Filters URLs based on a file of URL prefixes. The config file is
 * named by the Nutch configuration property "urlfilter.prefix.file".
 *
 * <p>The format of this file is one URL prefix per line. Empty lines and
 * lines whose first character is a space or <code>#</code> are ignored.</p>
 *
 * <p>If the configured resource cannot be found, the problem is logged and
 * the resulting filter rejects every URL.</p>
 */
public class PrefixURLFilter implements URLFilter {

  private static final Logger LOG =
    LogFormatter.getLogger("net.nutch.net.PrefixURLFilter");

  /**
   * Matcher built from the configured prefixes; stays <code>null</code> when
   * the default constructor could not locate the configuration resource.
   */
  private TrieStringMatcher trie;

  /**
   * Builds a filter from the resource named by the Nutch configuration
   * property "urlfilter.prefix.file".
   *
   * <p>A missing resource is logged at SEVERE level rather than thrown; the
   * filter is still constructed but {@link #filter(String)} then rejects
   * every URL.</p>
   *
   * @throws IOException if reading the resource fails
   */
  public PrefixURLFilter() throws IOException {
    String file = NutchConf.get("urlfilter.prefix.file");
    Reader reader = NutchConf.getConfResourceAsReader(file);

    if (reader == null) {
      // Leave trie unset: filter() treats a missing matcher as "no prefixes".
      LOG.severe("Can't find resource: " + file);
    } else {
      trie = readConfigurationFile(reader);
    }
  }

  /**
   * Builds a filter from the prefix file at the given path.
   *
   * @param filename path of the prefix file to read
   * @throws IOException if the file cannot be opened or read
   */
  public PrefixURLFilter(String filename) throws IOException {
    // readConfigurationFile takes ownership of the reader and closes it.
    trie = readConfigurationFile(new FileReader(filename));
  }

  /**
   * Decides whether a URL passes the filter.
   *
   * @param url the URL to test
   * @return <code>url</code> itself when the matcher returns a non-null
   *         result for it; <code>null</code> when it does not, or when no
   *         prefix configuration was loaded
   */
  public String filter(String url) {
    // No matcher means no configuration was found: nothing is accepted.
    // Without this guard the call below would throw NullPointerException.
    if (trie == null) {
      return null;
    }
    if (trie.shortestMatch(url) == null) {
      return null;
    }
    return url;
  }

  /**
   * Reads URL prefixes, one per line, and builds a matcher from them.
   * The reader is always closed before this method returns, whether it
   * completes normally or throws.
   *
   * @param reader source of the prefix lines; closed by this method
   * @return a matcher over every non-skipped line
   * @throws IOException if reading or closing the reader fails
   */
  private static TrieStringMatcher readConfigurationFile(Reader reader)
    throws IOException {

    BufferedReader in = new BufferedReader(reader);
    try {
      List urlprefixes = new ArrayList();
      String line;

      while ((line = in.readLine()) != null) {
        if (line.length() == 0) {
          continue;
        }

        char first = line.charAt(0);
        switch (first) {
        case ' ' : case '\n' : case '#' :    // skip blank & comment lines
          continue;
        default :
          urlprefixes.add(line);
        }
      }

      return new PrefixStringMatcher(urlprefixes);
    } finally {
      // Closing the BufferedReader also closes the wrapped reader.
      in.close();
    }
  }

  /**
   * Command-line driver: reads URLs from standard input, one per line, and
   * prints each URL that the filter accepts.
   *
   * @param args optional; <code>args[0]</code> is the path of a prefix file.
   *             Without it, the file named in the Nutch configuration is used.
   * @throws IOException if the prefix file or standard input cannot be read
   */
  public static void main(String args[]) throws IOException {
    PrefixURLFilter filter;
    if (args.length >= 1) {
      filter = new PrefixURLFilter(args[0]);
    } else {
      filter = new PrefixURLFilter();
    }

    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    String line;
    while ((line = in.readLine()) != null) {
      String out = filter.filter(line);
      if (out != null) {
        System.out.println(out);
      }
    }
  }

}